/********************************************************************************************** 
 *______________________________________SAMPLE SELECTION______________________________________*
 **********************************************************************************************/

/*
PURPOSE: 

    This code creates the file Symbol_Universe_2015. This file is a list of symbols collected 
    in the Daily TAQ Trades dataset in 2015, with symbol characteristics such as 
    average daily volume, average price, which exchange the symbol is listed on, etc.
    The main purpose of this script is to identify symbols that follow certain regularity
    conditions over this time period, for example, symbols that trade continuously
    throughout the year or have an average daily volume > 1 million shares. We label
    these symbols with the flag "f_Sample" and these will be the symbols used in most
    of the analysis for this paper.
    
    The output is a symbol level dataset containing symbol characteristics, flags, and 
    sample indicators (e.g. a row for SPY, and flags for whether it is listed on NYSE,
    experienced a listing change, is included in our sample, etc.)
    
PRIMARY INPUT: 
    
    "Volume_2015" is the output of the "Volume.sas". This is a dataset containing
    the volume traded on a particular symbol-date-exchange combination (e.g. the volume traded on NYSE
    for SPY on March 8, 2015). It contains an observation for any symbol-date-exchange combination
    with non-zero volume.
    
    The Daily TAQ Master file is a dataset containing a row for symbol-dates present in the
    Daily TAQ Trades and Quotes dataset (e.g. SPY on March 8, 2015). Each row contains the
    exchange at which the symbol was listed on that day.
    
APPROACH: 

    We first load the volume dataset to produce a list of all symbols and
    a list of all possible symbol-date combinations in the TAQ dataset. 
    From these datasets we produce symbol-level datasets containing the following variables:
    
    Listing Change: We load the Daily TAQ Master file, which lists the exchange each symbol
    is listed on for every trading day. Then we collapse the dataset to the symbol level
    and construct the flag "f_Listing Change" which indicates if the symbol has ever changed 
    its exchange during the year.
    
    Non-Trading Days: Using the dataset containing all possible symbol-date combinations
    we create the flag "f_Non_Trading_Days" which indicates if the symbol has ever had
    zero trading for a particular day.
    
    ADV: Using the volume dataset, we compute the average daily volume (ADV) for each symbol
    in shares and dollars over all the days in which the symbol experienced non-zero trading.
    We create the flag "f_Volume_S_1M" to identify the symbols with an average daily
    share volume greater than 1 million shares. We also create the flag "f_Volume_D_10M" to
    identify symbols with an average daily dollar volume greater than $10 Million.
    
    Penny Stock: Using the computed ADV, we divide the average daily price in dollars by the
    average daily price in shares to get the average price. We assign each symbol f_Penny_Stock
    if its average price is below $1.
    
    Then we merge these datasets into one symbol-level dataset. We then create the sample flag
    with the following conditions (the "~" indicates that this flag is false for the symbol):
    
    ~f_Listing_Change (no listing change)
    ~f_Non_Trading_Days (no interruption in trading)
     f_Volume_S_1M (share volume > 1 Million shares)
     f_Volume_D_10M (dollar volume > $10 Million shares)
    ~f_Penny_Stock (average price > $1)
    
    Afterwards we rank the symbols in the sample according to their ADV (in shares) and we 
    identify the top N of them, where N can be customized. By default we identify the top 100
    symbols.
    
SUPPLEMENTAL INPUT FILES: 
    
    "Server_Directories.xlsx"
    "Holidays.xlsx"
    
INSTRUCTION: 

  In the "Global Variables" section, verify that "year_input" is '2015'.
  Then run from the SAS Studio website.
*/

/*
 ****************************************************************************************
 * ______________________________________INITIALIZE______________________________________
 ****************************************************************************************
 */
/*
 * WRDS LIBRARY:
 *
 * This sets up the wharton library, which allows us to access the CRSP and TAQ datasets.
 */
%include '/wrds/lib/utility/wrdslib.sas';
options mprint;

/*
 * IMPORT INPUT PARAMETERS:
 * 
 * Make sure to enter the path to your input directory below.
 */
proc import datafile="/home/uchicago/eche/sasuser.v94/WRDS/Input/Server_Directories" 
  out= server_directories 
  dbms = xlsx 
  replace;
run;

data directory_import;
  set server_directories;
  
  if directory = "user_directory" then do; call symput('userdir', trim(path)); end;
  if directory = "volume_directory" then do; call symput('voldir', trim(path)); end;
  if directory = "temp_directory" then do; call symput('tempdir', trim(path)); end;
run;

/**
 * Directories:
 *
 * Sets up the directories according to the paths in the "Server_Directories.xlsx" file.
 */
libname userdir &userdir;
libname voldir &voldir;
libname tempdir &tempdir;

/* Set user_path for import/export 
 * We need to dequote the path passed in from "Server_Directories.xlsx"
 */
%let user_path  = %qsysfunc(dequote(&userdir));
%put &user_path;

/**********************************************************************************************
 *______________________________________GLOBAL VARIABLES______________________________________*
 **********************************************************************************************/

/* Top N Symbols:
 * This parameter chooses how many of the top symbols in terms of ADV to identify.
 */
%let top_n_input = 100;

/* 
 * Year Input:
 * 
 * When you run the wrapper, specify the year you wish to extract volume for with the format 
 * specified in the preamble. When running it from SAS Studio, uncomment the line right below 
 * and use the same format for the year in the wrapper for sysparm here.
 */
%let year_input = 2015;

/***********************************************************************************
 *______________________________________MACRO______________________________________*
 ***********************************************************************************/

/*
 * _MACRO_ : Load_Volume(Year)
 *
 * PARAM : Year: Specifies the sample year from which we extracted TAQ Volume data.
 * 
 * This macro first loads the symbol-date-exchange volume dataset. It extracts all
 * the symbols and dates found in this dataset.
 * 
 * The two outputs that are used throughout this script are "Symbol_Date_Volume",
 * which contains a volume record for all possible symbol-date combinations, and
 * "Key_Symbol" which is a list of all symbols in the volume data.
 *
 */
%Macro Load_Volume(Year);
  
  /** 
   * INPUT: Sym_Ex_Date_Volume, symbol-date-exchange volume
   * 
   * Load the symbol-date-exchange volume dataset.
   * 
   * OUTPUT: Sym_Ex_Date_Volume, symbol-date-exchange volume
   */
  data Sym_Ex_Date_Volume;
    set tempdir.volume_&Year.;
  run;

  /**
   * INPUT: Volume_Keys, table of symbol-date-exchange observations in the volume data
   * 
   * Generate a list for the symbols and dates present in the volume data.
   * 
   * OUTPUT: Key_Symbol, symbol directory for TAQ data
   *       Key_Date, date directory for TAQ data
   */
  proc sql;
    create table Key_Symbol as select distinct sym_root, sym_suffix, 1 as key from Sym_Ex_Date_Volume;
    create table Key_Date   as select distinct date,                 1 as key from Sym_Ex_Date_Volume;
  quit;
  
  /**
   * INPUTA: Key_Symbol, list of symbols in volume data
   * INPUTB: Key_Date, list of dates in volume data
   * 
   * Generate a table with every symbol-data-date combination.
   * How this works is that we do a "full join" between the "key" variables which are all 1's.
   * What "full join" does is produce an entry for each matching key, and since all keys match, 
   * this produces a combination of every symbol-date.
   * 
   * OUTPUT: Key_Symbol_Date, list of all possible symbol-date combinations from symbols and 
   *  dates in the TAQ Trades Data.
   */
  proc sql;
    create table Key_Symbol_Date
    as select
    a.sym_root, 
    a.sym_suffix,
    trim(a.sym_root) || trim(a.sym_suffix) as symbol,
    b.date
    from Key_Symbol as a
    full join Key_Date as b
    on a.key = b.key
    order by sym_root, sym_suffix, date;
  quit;
    
  /* 
   * INPUTA: Key_Symbol_Date, list of all possible symbol-date combinations in the TAQ Trades Data
   * INPUTB: Sym_Ex_Date_Volume, symbol-date-exchange volume
   * 
   * Merge Volume onto the list of all possible symbol-date combinations.
   * If a particular symbol did not trade on a day, the volume for that record is set to 0.
   * 
   * OUTPUT: Volume_All_Symbol_Dates, symbol-date-exchange volume for all possible symbol-date
   *  combinations
   */
  proc sql;
    create table Volume_All_Symbol_Dates
    as select
    a.*,
    b.ex,
    case when b.Volume_S = . then 0 else b.Volume_S end as Volume_S,
    case when b.Volume_D = . then 0 else b.Volume_D end as Volume_D
    from Key_Symbol_Date as a
    left join Sym_Ex_Date_Volume as b
    on a.sym_root = b.sym_root &
       a.sym_suffix = b.sym_suffix &
       a.date = b.date
    order by sym_root, sym_suffix, date;
  quit;
  
  /*
   * INPUT: Volume_All_Symbol_Dates, symbol-date-exchange volume for all possible symbol-date
   * combinations
   * 
   * Collapse symbol-date-exchange volume to the symbol-date level. 
   * Sum the volume for each symbol-date across all exchanges.
   * 
   * OUTPUT: Symbol_Date_Volume, symbol-date volume
   */
  proc sql;
    create table Symbol_Date_Volume
    as select
    sym_root,
    sym_suffix,
    date,
    sum(Volume_S) as Volume_S,
    sum(Volume_D) as Volume_D
    from Volume_All_Symbol_Dates
    group by sym_root, sym_suffix, date
    order by sym_root, sym_suffix, date;
  quit;

%Mend;

/*
 * _MACRO_ : Listing_Change
 * 
 * REQUIRES : Load_Volume
 *
 * PARAM : Year: Specifies the year for the TAQ Master dataset
 *
 * This macro loads the TAQ Master file, which is a list of all the 
 * TAQ symbols with the exchange they were listed at every date. 
 * We check whether a symbol has ever changed its listing, and if so, we mark it with f_Listing_Change.
 * We also use the Listing_Market dataset to mark symbols listed on NYSE with f_NYSE_Listed.
 *
 * OUTPUT: Symbol_Listing, a list of symbols in the volume dataset with the following flags:
 *         f_NYSE_Listing : if the symbol is listed on NYSE
 *         f_Listing_Change : if the symbol has ever changed the exchange its listed on
 */

%Macro Listing_Change(Year);
  /**
   * INPUT: Key_Symbol, a list of all symbols in the volume data
   * 
   * Concatenates the symbol root and symbol suffix into one symbol variable.
   * Then sorts by symbol. This is required because the TAQ Master File
   * sometimes includes the symbol suffix in the symbol root. The only reliable
   * way to merge with that dataset is to merge on the full symbol.
   * 
   * OUTPUT: Symbol_Names, symbol level directory of TAQ symbols
   */
  data Volume_Symbols;
    set Key_Symbol;
      
    symbol_id = trim(sym_root) || trim(sym_suffix);
      
    drop key;
    proc sort; by symbol_id;
  run;
  
  /* 
   * In the TAQ Master File, there are a few symbols for which the symbol root
   * contains both the symbol root and the symbol suffix in the Daily TAQ Trades File.
   * 
   * The symbols are:
   * GOOGL (GOOG.L): Google Class A
   * DISCK (DISC.K): Discovery Inc. Series C
   * DISKA (DISC.A): Discovery Inc. Series A
   * LBTYK (LBTY.K): Liberty Global Plc Class C
   * LBTYA (LBTY.A): Liberty Global Plc Class A
   * CMCSA (CMCS.A): Comcast Corporation
   * 
   * There is one case in which concatenating the symbol root and suffix
   * results in two different securities having the same symbol (HUB.B and HUBB).
   * We find such duplicates and concatenate the symbol suffix of those securities
   * onto their symbol variable.
   *
   * This will result in HUB.B having the symbol "HUBB_B" and HUBB will have
   * the symbol "HUBB_". This symbol variable is only used to merge with the 
   * TAQ Master file, so all that matters is that we have a unique identifier
   * for each security.
   */
  data Volume_Symbols;
    set Volume_Symbols;
    
    /* 
     * If the symbol is unique, it is the first and last row of that symbol.
     * If there are duplicate symbols, then there are two rows so the first and last rows
     * will be different.
     */
    f_duplicate_symbol =  ~(first.symbol_id and last.symbol_id);
    if f_duplicate_symbol then symbol_id = trim(symbol_id) || ("_" || trim(sym_suffix));
    
    by symbol_id;
  run;
  
  /*
   * INPUT: TAQ Master file, symbol-date level dataset listing which symbol-date combinations 
   *    are present in the TAQ data, and symbol information such as primary listing and industry code.
   * 
   * We load the TAQ Master file and create the symbol variable as we do for "Volume_Symbols".
   * We only keep the variables that we use (symbol, symbol root, symbol suffix, listed exchange)
   * and we sort this dataset by symbol and date.
   * 
   * OUTPUT: Listing_Exchange, Symbol-date level flags that says where the security is listed at each date
   */
  data Symbol_Date_Listed_Exchange;
    set taqmsec.mastm_&Year.:;
    
    sym_root = symbol_root;
    sym_suffix = symbol_suffix;
    symbol_id = trim(sym_root) || trim(sym_suffix);
    
    keep date symbol_id sym_root sym_suffix listed_market;
    proc sort; 
    by symbol_id sym_root sym_suffix date;
  run;

  /**
   * INPUT: Listing_Exchange, symbol-date level characteristics, says which exchange the security is listed
   * 
   * We collapse the symbol-date dataset into a symbol dataset. In the process we create the flag "f_Listing_Change"
   * that indicates if the symbol has had more than 1 exchange on which it was listed during 2015.
   * 
   * OUTPUT: Listing_Change_Flags, symbol level dataset with f_Listing_Change
   */ 
  proc sql;
    create table Listing_Change
    as select
    symbol_id,
    sym_root,
    sym_suffix,
    count(distinct(listed_market)) > 1 as f_Listing_Change
    from Symbol_Date_Listed_Exchange
    group by symbol_id, sym_root, sym_suffix;
  quit;
  

  /*
   * INPUT: Symbol_Date_Listed_Exchange, symbol-date level dataset of which exchange the securityis listed
   * 
   * Records a list of the symbols in the Listing_Exchange dataset, 
   * and where they were listed on the last day of 2015.
   * 
   * OUTPUT: Symbol_Listed_Exchange, symbol level characteristics, says which exchange the security is listed
   */
  data Symbol_Listed_Exchange;
    set Symbol_Date_Listed_Exchange;
    if last.sym_suffix;
    keep symbol_id sym_root sym_suffix listed_market;
    by symbol_id sym_root sym_suffix;
  run;

  /**
   * INPUTA: Listing_Change, symbol-level dataset with f_Listing_Change
   * INPUTB: Symbol_Listed_Exchange, symbol-level dataset with the exchange 
   * the security was listed on the last day of 2015
   * 
   * Merge Symbol_Listed_Exchange onto Listing_Change. They both should have the same symbols
   * since they are both from the TAQ Master file.
   *
   * Creates a flag if the symbol is listed on NYSE in its last observation of 2015.
   * 
   * OUTPUT: Listing_Change_Exchange, symbol-level flags with f_Listing_Change and the listed exchange
   */
  proc sql;
    create table Listing_Change_Exchange
    as select
    a.*,
    b.listed_market,
    b.listed_market = "N" as f_NYSE_Listed
    from Listing_Change as a
    left join Symbol_Listed_Exchange as b
    on a.symbol_id = b.symbol_id & 
       a.sym_root   = b.sym_root &
       a.sym_suffix = b.sym_suffix
    order by symbol_id;
  quit;
  
  /* 
   * Likewise with "Volume_Symbols", there is one case in which concatenating the symbol root and suffix
   * results in two different securities having the same symbol (HUB.B and HUBB).
   * We find such duplicates and concatenate the symbol suffix of those securities
   * onto their symbol variable.
   *
   * This will result in HUB.B having the symbol "HUBB_B" and HUBB will have
   * the symbol "HUBB_". This symbol variable is only used to merge with the 
   * TAQ Master file, so all that matters is that we have a unique identifier
   * for each security.
   */
  data Listing_Change_Exchange;
    set Listing_Change_Exchange;
    
    /* 
     * If the symbol is unique, it is the first and last row of that symbol.
     * If there are duplicate symbols, then there are two rows so the first and last rows
     * will be different.
     */
    f_duplicate_symbol =  ~(first.symbol_id and last.symbol_id);
    if f_duplicate_symbol then symbol_id = trim(symbol_id) || ("_" || trim(sym_suffix));

    by symbol_id;
  run; 

  /*
   * INPUTA: Volume_Symbols, symbol-level directory of TAQ symbols
   * INPUTB: Listing_Change_Exchange, symbol-level flags that says where the security is listed
   * 
   * Merge Listing_Change_Exchange onto Volume_Symbols. This will create a dataset
   * containing the f_Listing_Change and f_NYSE_Listed flags for symbols in the volume data.
   *
   * We merge the datasets on matching symbol variables instead of "sym_root" and "sym_suffix"
   * as we typically do in this script. This is because the TAQ Master dataset
   * 
   * OUTPUT: Symbol_Listing, symbol-level names and nyse_listed flag
   */
  proc sql;
    create table Symbol_Listing
    as select
    a.*,
    b.f_Listing_Change,
    b.f_NYSE_Listed,
    b.listed_market
    from Volume_Symbols as a
    left join Listing_Change_Exchange as b
    on a.symbol_id = b.symbol_id
    order by sym_root, sym_suffix;
  quit;
  
  /* Remove helper variables */
  data Symbol_Listing;
    set Symbol_Listing;
    drop symbol_id f_duplicate_symbol;
  run;
%Mend;

/*
 * _MACRO_ : Non_Trading_Days
 *
 * REQUIRES : Load_Volume
 *
 * Using "Symbol_Date_Volume" dataset, which contains all possible symbol-date combinations and the volume traded
 * on those symbol-dates to find, we find for each symbol, how many days does the symbol experience zero trading.
 *
 * If the symbol has 1 or more zero-trading days, then we raise the flag "f_Non_Trading_Days". Note that
 * the dates present in "Symbol_Date_Volume" are dates with non-zero trading within the entire market.
 * We only raise f_Non_Trading_Days if a symbol does not trade a day when other symbols are.
 *
 * OUTPUT: Symbol_Activity, a list of symbols in the volume dataset with the following flag:
 *         f_Non_Trading_Days: if the symbol has zero volume traded for at least 1 trading day during the year
 */
%Macro Non_Trading_Days;
    /*
     * INPUT: Symbol_Date_Volume, symbol-date-exchange volume for all possible symbol-date
     * combinations.
     * 
     * Add up daily volume across exchanges for each symbol.
     * 
     * OUTPUT: Symbol_Activity, symbol-level dataset with f_Non_Trading_Days
     */
    proc sql;
      create table Symbol_Activity
      as select
      sym_root,
      sym_suffix,
      sum(Volume_S > 0) as n_Trading_Days,
      sum(Volume_S = 0) as n_Non_Trading_Days,
      calculated n_Non_Trading_Days > 0 as f_Non_Trading_Days
      from Symbol_Date_Volume
      group by sym_root, sym_suffix
      order by sym_root, sym_suffix;
    quit;
    
%Mend;

/*
 * _MACRO_ : Average_Daily_Volume
 * 
 * REQUIRES : Load_Volume
 *
 * Using "Symbol_Date_Volume" dataset, which contains the volume traded on every symbol-date in the volume dataset,
 * we calculate the average daily volume transacted on each exchange.
 *
 * OUTPUT: Symbol_ADV, a list of symbols in the volume dataset with the following flag:
 *         f_Volume_S_1M : indicates whether the average daily volume is greater than 1 million (shares)
 *         f_Volume_D_10M : indicates whether the average daily volume is greater than 1 million (dollars)
 */
%Macro Average_Daily_Volume;

    /**
     * INPUT: Symbol_Date_Volume, symbol-date volume dataset
     * 
     * For each symbol in "Symbol_Date_Volume" volume data, we calculate the average daily volume
     * over the days that experienced trading throughout the observation period.
     * 
     * Then we flag symbols with average daily volume below 1 million shares. 
     * Seperately flag symbols with average daily volume traded below 10 million dollars.
     * 
     * OUTPUT: Symbol_ADV, symbol level average daily volume
     */
    proc sql;
      create table Symbol_ADV
      as select
      sym_root,
      sym_suffix,
      sum(Volume_S > 0) as n_Trading_Days,
      sum(Volume_S = 0) as n_Non_Trading_Days,
      sum(Volume_S) / calculated n_Trading_Days as ADV_S,
      sum(Volume_D) / calculated n_Trading_Days as ADV_D,
      calculated ADV_S > 1000000  as f_Volume_S_1M,
      calculated ADV_D > 10000000 as f_Volume_D_10M
      from Symbol_Date_Volume as a
      group by sym_root, sym_suffix
      order by sym_root, sym_suffix;
    quit;
%Mend;


/*
 * _MACRO_ : Penny_Stock
 * 
 * REQUIRES : Load_Volume
 *
 * We compute the average price for each symbol by dividing the total volume (in dollars) across
 * all days in 2015 by total volume (in shares). Assign all symbols with an average price below $1 
 * with the "f_Penny_Stock" flag.
 *
 * OUTPUT: Symbol_Price, a list of symbols in the volume dataset with the following flag:
 *         f_Penny_Stock : if the average price is less than $1
 *         c_Price_Standard : "High Price" if the average price is above $50
 *                            "Mid Price" if the average price is above $20
 *                            "Low Price" if the average price is above $1 and below $20
 */

%Macro Penny_Stock;
  /*
   * INPUT: Symbol_Date_Volume, symbol-date volume dataset
   * 
   * Calculate average price and assign f_Penny_Stock for securities with an average price below $1.
   * 
   * OUTPUT: Symbol_Price, symbol-level average price flags
   */
  proc sql;
    create table Symbol_Price
        as select
        sym_root,
        sym_suffix,
        sum(Volume_D)  / sum(Volume_S)  as Avg_Price,
        case when calculated Avg_Price <= 1 then 1 else 0 end as f_Penny_Stock,
        case when calculated Avg_Price  > 50 then "High Price"
             when calculated Avg_Price <= 50 &
                  calculated Avg_Price  > 20 then "Mid Price"
             when calculated Avg_Price <= 20 &
                  calculated Avg_Price  > 1  then "Low Price"
             when calculated Avg_Price  < 1  then "Penny Stock"
             else "" end as c_Price_Standard
        from Symbol_Date_Volume
        group by sym_root, sym_suffix
        order by sym_root, sym_suffix;
  quit;
%Mend;

/*
 * _MACRO_ : Sample_Selection
 *
 * REQUIRES : Load_Volume
 *            Listing_Change
 *            Non_Trading_Days
 *            Average_Daily_Volume
 *            Penny_Stock
 *
 * First this macro runs all the previous macros. The result is 4 symbol-level datasets
 * each with flags and variables described by their names. This macro merges the outputs
 * to obtain a symbol-level dataset with listing change, average daily volume, average price,
 * and trading interruption flags.
 * 
 * Finally, this macro creates the sample indicator ("f_Sample") using the flags created before to
 * filter out symbols. Then we identify the top "N" symbols from average daily share volume
 * with the flag "f_Sample_T&top_n" where "&top_n" is a reference to the parameter "top_n"
 * that determines how many of the top symbols we identify. The default is top 100.
 *
 * OUTPUT: Symbol_Universe_2015, symbol-level dataset with flags and sample indicators
 */
%Macro Sample_Selection(year, top_n);
  
  /* Load volume datasets */
  %Load_Volume(&Year);

  /* Run symbol flag macros */
  %Listing_Change(&Year);
  %Non_Trading_Days;
  %Average_Daily_Volume;
  %Penny_Stock;

  /* 
   * INPUTA: Symbol_Listing, a list of symbols with f_Listing_Change and f_NYSE_Listed
   * INPUTB: Symbol_Activity, a list of symbols with f_Non_Trading_Days
   * INPUTC: Symbol_ADV, a list of symbols with f_Volume_S_1M and f_Volume_D_10M
   * INPUTD: Symbol_Price, a list of symbols with f_Penny_Stock
   * 
   * Merge each of the symbol-level datasets to produce a dataset that contains all the flags created
   * in this script.
   *          
   * OUTPUT: TAQ_Flags, symbol-level dataset with TAQ flags
   */
  data TAQ_Flags;
    merge Symbol_Listing(in = a)
          Symbol_Activity(in = b)
          Symbol_ADV(in = c)
          Symbol_Price(in = d);
    
    by sym_root sym_suffix;
  run;
  
  /* 
   * INPUT: TAQ_Flags, symbol-level dataset with TAQ flags
   * 
   * Assign the sample indicator to all symbols with no non-trading days (trades for every 
   * trading day in 2015), share volume greater than 1 million shares, dollar volume
   * greater than 10 million dollars, an average price > $1, and did not experience any
   * listing change throughout the year.
   *
   * Sort the symbols in descending order of Average Daily Volume (in shares).
   *          
   * OUTPUT: Sample_Flag, symbol-level dataset with sample indicator
   */        
  data Sample_Flag;
    set TAQ_Flags;
    
    f_Sample = ~f_Non_Trading_Days
             &  f_Volume_S_1M
             &  f_Volume_D_10M
             & ~f_Penny_Stock
             & ~f_Listing_Change;
    
    proc sort; by descending ADV_S;
  run;
  
  /* 
   * INPUT: Sample_Flag, symbol-level dataset with sample indicator
   * 
   * Label the first N symbols (N specified by the parameter "Top_N") of the symbols in the sample
   * (labelled by f_Sample). These will be the top n symbols (in terms of ADV) in the sample
   * since we sorted the symbols in decreasing order of ADV.
   *          
   * OUTPUT: Top_N_Symbols, symbol-level dataset for only the symbols 
   * in the sample with Top N indicator
   */   
  data Top_N_Symbols;
    set Sample_Flag;
    where f_Sample = 1;
    
    f_Sample_T&top_n. = _n_ <= &top_n;
  run;

  /* 
   * INPUTA: Sample_Flag, symbol-level dataset with sample indicator
   * INPUTB: Top_N_Symbols, symbol-level dataset for only the symbols 
   * in the sample with Top N indicator
   * 
   * Merge Top_N_Symbols, which contains all the symbols in the sample with the "f_Sample_T&top_n" flag,
   * onto Sample_Flag. This will assign the f_Sample_T&top_n flag to all symbols in the dataset
   * (0 for all symbols not in the symbol sample).
   * 
   * Export this to the temp directory.
   *          
   * OUTPUT: Symbol_Universe_2015, symbol level characteristics and volume (Entire Sample)
   */
  proc sql;
    create table tempdir.Symbol_Universe_&year.
    as select
    a.*,
    trim(a.sym_root) || trim(a.sym_suffix) as symbol,
    case when b.f_Sample_T&top_n. is null then 0 else b.f_Sample_T&top_n. end as f_Sample_T&top_n.
    from Sample_Flag as a
    left join Top_N_Symbols as b
    on a.sym_root = b.sym_root &
         a.sym_suffix = b.sym_suffix
    order by sym_root, sym_suffix;
  run;
  
  /* Export Symbol Universe */
  proc export data = tempdir.Symbol_Universe_&year.
        outfile = "&user_path./Symbol_Universe_&year..csv"
        dbms = csv
    replace;
    
%Mend;

/*
 *********************************************************************************
 * ______________________________________RUN______________________________________
 *********************************************************************************
 */

%Sample_Selection(&year_input., &top_n_input.);
